/*----------------------------------------------------------------------*/
/* Name:         read merged data v17.sas                                   */
/* Purpose:      Read in a series of routes and process them for analysis. */
/*               Run and save the analysis for a paper on errors in paradata. */
/*                                                                            */
/* Created:      v01 2013-04-02                                                    */
/* Notes:        samplehouse_spjoin_routes_allq1s2_with_speed_071614_gmt.csv is created by a series of Python scripts  */
/*----------------------------------------------------------------------*/

/* Library references used below: GPS holds the finalization-date file,
   CALLS holds the appointment file. */
libname gps "<Path>\GPS Files";
libname calls "<Path>\GPS Files\calls";

/* Import the routes CSV into WORK.ROUTES. Variable names come from the
   header row, data start on row 2, and up to 8000 rows are scanned when
   the column types are guessed. */
proc import
        datafile="<path>\samplehouse_spjoin_routes_allq1s2_with_speed_071614_gmt.csv"
        out=work.routes
        dbms=csv
        replace;
    guessingrows=8000;
    getnames=yes;
    datarow=2;
run;

/* Order the routes by COMBINEID so they can be merged by that key. */
proc sort data=routes;
    by combineid;
run;

/* Build a lookup of finalization dates keyed by COMBINEID. */
data lastday;
    set gps.q1lastday;
    /* The first 11 characters of the sample line ID identify the sampled HU. */
    combineid = substr(vsamplelineid, 1, 11);
    keep combineid final_date;
run;

/* Keep one row per COMBINEID (NODUPKEY drops later rows with the same key). */
proc sort data=lastday nodupkey;
    by combineid;
run;

/* Merge the finalization date onto the routes (only rows present in ROUTES
   are kept) and derive the analysis variables:
     - numeric copies of the GPX and call times, and parsed GPX and call dates
     - APT: indicator for an apartment
     - RECTYPE: whether a call note, a GPX point, both, or neither (sample
       line only) exists for the record
     - DATE / COMBINETIME: a common date and time used for sorting
     - SEGMENT / PSU / NSFGID: sample design info in a form shared across files */
data routes2;
    merge routes(in=a) lastday;
    by combineid;
    if a;

    /* Format date and time variables. The LENGTH statement previously named
       NEWGPXTIME twice; each variable is now listed once. */
    length newgpxtime newcallrtime 8;
    newgpxtime   = gpxtime;
    newcallrtime = callrtime;
    /* The second '_'-delimited piece of GPXDATE is read as the GPX date. */
    newgpxdate   = input(scan(gpxdate, 2, '_'), ANYDTDTE8.);
    /* NOTE(review): CALLDATE is compared with numeric missing below but is
       written here with a character format -- confirm the intended format. */
    charcalldate = put(calldate, $8.);
    newcalldate  = input(charcalldate, ANYDTDTE8.);
    if calldate = . then newcalldate = .;
    format newgpxtime newcallrtime timeampm11. newcalldate newgpxdate date10.;

    /* Add an indicator for apartment: any non-blank second address line. */
    if addr2 ne '' then apt = 1; else apt = 0; /*n=687*/

    /* Code records by which sources are present. */
    length rectype $15.;
    if gpxid = 0 and callid > 0 then rectype = 'callnote only';
    if gpxid > 0 and callid = 0 then rectype = 'gpx only';
    if gpxid > 0 and callid > 0 then rectype = 'both';
    if gpxid = 0 and callid = 0 then rectype = 'sample line';

    /* Combined date (GPX date, else call record date) so that call records
       with no merged GPX can be found between GPX points. */
    date = newgpxdate;
    if date = . then date = newcalldate;
    format date date10.;

    /* Combined time (GPX time, else call record time), for the same purpose. */
    combinetime = newgpxtime;
    if combinetime = . then combinetime = newcallrtime;
    format combinetime time8.;

    /* Format sample design info for merging to other files. */
    segment = vsamplelin;
    psu     = substr(segment, 1, 4);
    /* The first three characters of CALLNOTE; later steps treat NSFGID as the
       interviewer ID. */
    nsfgid  = substr(callnote, 1, 3);
run;



/* Identify the subset of records used for analysis.
     - Rows with no call (CALLID=0) whose FINAL_DATE is earlier than the GPX
       date are deleted: the unit was already finalized when it was passed.
     - Rows with neither a GPX point nor a call are written to UNPASSED.
     - Every other row is written to ROUTES3.
   NOTE(review): a missing FINAL_DATE compares lower than any date, so such
   rows with a GPX date and CALLID=0 are deleted too -- confirm that every
   line has a final date. */
data routes3 (keep=combineid near_dist near_fid total_leng gpxspd gpxid
                   gpxtime callrtime calldate callid resultcode
                   newgpxtime newgpxdate newcallrtime newcalldate
                   apt rectype date segment psu nsfgid calcspd)
     unpassed(keep=combineid near_dist near_fid total_leng gpxspd gpxid
                   gpxtime callrtime calldate callid resultcode
                   newgpxtime newgpxdate newcallrtime newcalldate odd
                   apt rectype date segment psu nsfgid calcspd final_date);
    set routes2;

    /* Drove by a unit that was already finalized. */
    if callid = 0 and final_date lt newgpxdate then delete;

    /* Neither called nor passed goes to UNPASSED; the rest to ROUTES3. */
    if not (gpxid = 0 and callid = 0) then output routes3;
    else output unpassed;
run;

/* Check: distribution of record type and call ID in ROUTES3. */
proc freq data=routes3;
    tables rectype callid;
run;

/* Number of rows per COMBINEID, saved to HHCOUNT without printing. */
proc freq data=routes3;
    tables combineid / out=hhcount noprint;
run;

/* Check whether each interviewer-day-segment combination includes GPX and
   call note data: count the rows with a call date and the rows with a GPX
   date within every DATE x SEGMENT group. */
proc sort data=routes3;
    by date segment;
run;

data segday;
    set routes3;
    by date segment;
    retain calls gpxpoints;

    /* Restart both counters at the top of each segment-day. */
    if first.segment then do;
        calls     = 0;
        gpxpoints = 0;
    end;

    /* A comparison evaluates to 1 or 0, so each counter grows by one only
       when the corresponding date is present. */
    calls     + (newcalldate ne .);
    gpxpoints + (newgpxdate ne .);

    /* One output row per segment-day, carrying the final counts. */
    if last.segment;
    keep date segment calls gpxpoints;
run;

/* Segment-days with zero calls or zero GPX points, for use later. */
data probsegday;
    set segday;
    where calls = 0 or gpxpoints = 0;
run;

/* Check whether each interviewer-day-PSU combination includes GPX and call
   note data: count the rows with a call date and the rows with a GPX date
   within every DATE x PSU group. */
proc sort data=routes3;
    by date psu;
run;

data psuday;
    set routes3;
    by date psu;
    retain calls gpxpoints;

    /* Restart both counters at the top of each PSU-day. */
    if first.psu then do;
        calls     = 0;
        gpxpoints = 0;
    end;

    /* Each counter grows by one only when the corresponding date is present. */
    calls     + (newcalldate ne .);
    gpxpoints + (newgpxdate ne .);

    /* One output row per PSU-day, carrying the final counts. */
    if last.psu;
    keep date psu calls gpxpoints;
run;

/* PSU-days with zero calls or zero GPX points, for use later. */
data probpsuday;
    set psuday;
    where calls = 0 or gpxpoints = 0;
run; /*There are no days in this file that are not covered by the segment-days file. (PROBSEGDAY)*/

/* Delete every segment-day listed in PROBSEGDAY, i.e. those with zero GPX
   points or zero calls. Zero GPX points could have occurred because the
   segment visit was not recorded due to failure, or because the device was
   turned off. These count against the GPX response rate (i.e. interviewer
   compliance). */
proc sort data=routes3;
    by date segment;
run;
proc sort data=probsegday;
    by date segment;
run;

data routes4;
    merge routes3(in=in_routes)
          probsegday(in=flagged drop=calls gpxpoints);
    by date segment;
    /* Drop the flagged segment-days, then keep only rows that came from ROUTES3. */
    if flagged then delete;
    if in_routes;
run;


/*Eliminate duplicated call records: a call that was joined to several GPX rows*/
/*should keep its call information on only one of them.*/
/*1. Sort by date, ID and CallID, then descending GPX time. This groups the rows*/
/*   that share a call, latest GPX time first.*/
/*2. Within each date/ID/CallID group, the first row whose call note time is greater*/
/*   than its GPX time keeps the call information; the call information is blanked*/
/*   on the remaining rows of the group.*/
/*NOTE(review): groups with CallID=0 pass through the same logic; blanking them changes*/
/*nothing for CallID, presumably also for the other call fields -- confirm.*/
proc sort data=routes4; by date combineid callid descending newgpxtime; run;

data routes5;
set routes4;
/*TMP is a retained flag: 1 = no row of the current group has kept the call yet, 0 = one has.*/
retain tmp;
by date combineid callid;
if first.callid then tmp=1;
/*Last row of the group: if an earlier row already kept the call, blank it here;*/
/*otherwise flag NOMATCH=1 (no earlier row had a call time later than its GPX time).*/
/*NOMATCH is not retained, so it is missing on every other row.*/
if last.callid and tmp=0 then do;callid=0; resultcode=''; newcallrtime=.; end;
else if last.callid and tmp=1 then nomatch=1;
/*The first row (latest GPX time first) with call time > GPX time keeps the call and*/
/*clears TMP. Any other row that is not the last of its group has its call blanked.*/
if newcallrtime>newgpxtime and tmp=1 then tmp=0;
else if not last.callid then do; callid=0; resultcode=''; newcallrtime=.; end;

/*Recode the record type, since CALLID may have been reset to 0 above.*/
if gpxid=0 and callid>0 then rectype='callnote only';
if gpxid>0 and callid=0 then rectype='gpx only';
if gpxid>0 and callid>0 then rectype='both';
if gpxid=0 and callid=0 then rectype='sample line';

/*Absolute gap between the GPX time and the call record time, divided by 60*/
/*(minutes, assuming the times are in seconds -- confirm).*/
diffminutes=abs(newgpxtime-newcallrtime)/60;
run;


/* Distribution of CALLID after de-duplication. */
proc freq data=routes5;
    tables callid;
run;

/* Checking: TEST receives one row per DATE x COMBINEID x CALLID and TEST1
   receives the dropped duplicates. Only CALLID=0 rows should be duplicated. */
proc sort data=routes5 out=test dupout=test1 nodupkey;
    by date combineid callid;
run;


/* Create the list of interviewer IDs observed on each segment-day (rows with
   a blank NSFGID are excluded). It is used to check for MORE THAN ONE
   interviewer in a segment on the same day. */
proc freq data=routes5;
    where nsfgid ne '';
    tables date*segment*nsfgid / noprint out=iwerlist;
run;

/* Drop the frequency columns and rename the ID so that it does not collide
   with NSFGID during the merge below. */
data iwerlist2;
    set iwerlist(drop=count percent rename=(nsfgid=nsfgid2));
run;

proc sort data=routes5;
    by date segment;
run;
proc sort data=iwerlist2;
    by date segment;
run;

/* Attach the segment-day interviewer ID, flag disagreements, and set up the
   CALLED and CONTACT indicators. */
data routes6;
    merge routes5 iwerlist2;
    by date segment;

    /* CHECK=1 when the row's own interviewer ID differs from the listed one. */
    if nsfgid ne nsfgid2 and nsfgid ne '' then check = 1;
    /* Fill a blank interviewer ID from the segment-day list. */
    if nsfgid = '' then nsfgid = nsfgid2;

    /* CALLED can be used to calculate the proportion of sampled HUs that were
       walked past: 1 - mean(called) is the proportion walked past.
       The test was CALLID>1, which left CALLED missing for CALLID=1; every
       other step in this program treats CALLID>0 as "has a call record". */
    if callid = 0 then called = 0;
    if callid > 0 then called = 1;

    /* Contact rate variable: any case that has a disposition starts as a
       noncontact... */
    if resultcode ne '' then contact = 0;
    /* ...and the contact dispositions are then coded as contact. */
    if resultcode in ('1001','4001','4002','4101','4201','4202','4301','4302','8010') then contact = 1;
run;

/* Distribution of result codes and of the multiple-interviewer flag. */
proc freq data=routes6;
    tables resultcode check; /*No segments had two interviewers in one day.*/
run;

/* Mean of CALLED and CONTACT for each interviewer ID, saved to BYIWER. */
proc summary data=routes6 nway;
    var called contact;
    class nsfgid;
    output out=byiwer mean=;
run;


/* Merge on an indicator for whether the current call was a scheduled
   appointment. The appointment file is keyed by COMBINEID and CALLID. */
data appts;
    set calls.appts;
    combineid = substr(vsamplelineid, 1, 11);
    /* CHECK=1 when characters 12-13 of the sample line ID are not '11'. */
    if substr(vsamplelineid, 12, 2) ne '11' then check = 1;
    callid = ncallinfoid;
    drop vsamplelineid ncallinfoid;
run;

proc sort data=appts;
    by combineid callid;
run;
proc sort data=routes6;
    by combineid callid;
run;

/* Keep every ROUTES6 row. Both inputs carry a CHECK variable, so on matched
   rows the value from APPTS replaces the one from ROUTES6. */
data routes7;
    merge routes6(in=in_routes) appts;
    by combineid callid;
    if in_routes;
run;

proc freq data=routes7;
    tables appt_followup_day;
run;

/* A simple rule decides which routes are on the way to an appointment:
   everything in the 30 minutes before the appointment call that has no call
   record of its own is the interviewer traveling to the appointment. The
   rule is applied within each interviewer-day, with rows sorted by time.
   GPX time is the time of the route; call records with no route get the call
   record time inserted into the GPX time field. */
data routes8;
    set routes7;
    if newgpxtime = . then newgpxtime = newcallrtime;
run;

/* Latest time first, so an appointment row is read before the rows that
   precede it in time. */
proc sort data=routes8;
    by nsfgid2 date descending newgpxtime;
run;

data routes9;
    set routes8;
    by nsfgid2 date;
    retain appt appt_time;

    /* No appointment has been seen yet at the start of an interviewer-day. */
    if first.date then do;
        appt      = 0;
        appt_time = .;
    end;

    trav_to_appt = 0;

    /* Flag "short routes", that is routes shorter than 15 meters. */
    stationary = (total_leng < 15);

    /* Check whether the current GPX point is within 30 minutes of the
       appointment seen most recently in this interviewer-day. Time is in
       seconds, so 30 mins * 60 secs = 1800 seconds. */
    check_diff = appt_time - newgpxtime;
    if appt = 1 and check_diff le 1800 and resultcode = '' then trav_to_appt = 1;

    /* Record the appointment only after the test above, so that the
       appointment row is not compared against itself. */
    if appt_followup_day = 1 then do;
        appt_time = newgpxtime;
        appt      = 1;
    end;
run;

/* ROUTES_ONLY: ROUTES9 without the "Call Record Only" rows. */
data routes_only;
    set routes9;
    if rectype = "callnote only" then delete;
run;

/* List of all DATE x SEGMENT combinations, saved to SEGDAYS. */
proc freq data=routes9;
    tables date*segment / out=segdays noprint;
run;


/* Macro DO_ALL: loop over every interviewer-segment-day in SEGDAYS. For each
   one, take the UNPASSED units of that segment that were finalized after that
   day (still active on the day), remove duplicate units, and append them to
   NOTPASSED with COUNT=0.
   Changes from the earlier version:
     - the number of iterations is read from SEGDAYS instead of being
       hard-coded to 101, so no segment-day is skipped and no stale
       CURRDAY/CURRSEG value is re-used when the list is shorter;
     - NOTPASSED is emptied first, so re-running the macro in the same session
       does not append a second copy of every row;
     - the test "date > final_date" was removed: it could never be true after
       the subsetting IF that requires final_date > date. */
%macro do_all;
    %local i nsegdays;

    /* Number of rows in SEGDAYS (NOBS= is available without reading a row). */
    data _null_;
        if 0 then set segdays nobs=nobs;
        call symputx('nsegdays', nobs);
        stop;
    run;

    /* Start from an empty NOTPASSED; NOWARN keeps the log quiet when the
       dataset does not exist yet. */
    proc datasets nolist nowarn;
        delete notpassed;
    quit;

    %do i=1 %to &nsegdays;
        /* Read row &i of SEGDAYS into the macro variables CURRDAY and CURRSEG. */
        data _null_;
            set segdays(firstobs=&i obs=&i);
            call symput('currday', put(date, date9.));
            call symput('currseg', segment);
        run;
        %put &currday;
        %put &currseg;

        /* Units of this segment that were finalized after the current day. */
        data segdaytmp;
            set unpassed(keep=combineid segment final_date apt);
            if final_date gt "&currday"d and segment="&currseg";
            segday = &i;
            date   = "&currday"d;
            format date mmddyy10.;
            count  = 0;
        run;

        /* One row per unit within the segment-day. */
        proc sort data=segdaytmp nodupkey;
            by combineid;
        run;

        proc append base=notpassed data=segdaytmp;
        run;
    %end;
%mend do_all;

%do_all;


/* Number of routes associated with each housing unit that is passed.
   These numbers go into Table 4. The number of active units passed is in
   POINTS; the number of active units passed excluding apartments is in
   POINTS2. Author's recorded results: there are 1638 not-passed units, none
   of them apartments, so the percent ever passed is 1065/(1065+1638)=39%,
   and excluding apartments it is 945/(945+1638)=37%. */
proc freq data=routes_only;
    tables newgpxdate*combineid / out=points noprint;
run;

/* The same count, restricted to non-apartments. */
proc freq data=routes_only;
    where apt = 0;
    tables newgpxdate*combineid / out=points2 noprint;
run;

/* Add the never-passed units (COUNT=0) to the passed ones. */
data points;
    set points notpassed;
run;

/* The same count, restricted to routes that are not short (STATIONARY=0). */
proc freq data=routes_only;
    where stationary = 0;
    tables newgpxdate*combineid / out=points_stat noprint;
run;

data points_stat;
    set points_stat notpassed;
run;


/* ------------------------------ Report ------------------------------ */
/* Everything printed from here until ODS RTF CLOSE goes to the RTF file. */
ods rtf file="<path>\evidence of driving by v17 2016-04-04.rtf";

title1 "Type of Record from Joint Merge of Routes and Call Records";
proc freq data=routes9;
    tables rectype;
run;

title1 "Not Passed Units -- Apartment vs Not";
proc freq data=notpassed;
    tables apt;
run;

/* Number of times an active sample unit was walked past without a call. */
title1 "Routes Going by Active Sampled IDs with no Call record (0=no record)";
proc freq data=routes_only;
    tables callid rectype; /*0=not called*/
run;

/* 917 unique call records (author's count). How many unique IDs were called?
   Both steps run with NOPRINT and write to UNIQHUS, so nothing is printed
   and the second step replaces the output of the first. */
title1 "Routes and Call Notes -- Number of HU's called";
proc freq data=routes5;
    where callid > 0;
    tables combineid / out=uniqhus noprint;
run;

title1 "Routes only -- Number of HU's called";
proc freq data=routes_only;
    where callid > 0;
    tables combineid / out=uniqhus noprint;
run;

/* Call-record frequency, leaving out rows flagged as travel to an appointment. */
title1 "Routes Going by Active Sampled IDs with no Call record (0=no record)";
title2 "EXCLUDING When the interviewer was on the way to an appt";
proc freq data=routes_only;
    where trav_to_appt in (0,.);
    tables callid; /*0=not called*/
run;

/* Call-record frequency for non-apartments only. */
title1 "Routes Going by Active Sampled IDs with no Call record (0=no record)";
title2 "EXCLUDING Apartments";
title3;
proc freq data=routes_only;
    where apt = 0;
    tables callid; /*0=not called*/
run;

/* Apartment versus non-apartment rows. */
title1 "Number of apartments";
title2;
title3;
proc freq data=routes_only;
    tables apt;
run;

/* Call-record frequency for routes that are not short. */
title1 "Routes Going by Active Sampled IDs with no Call record (0=no record)";
title2 "EXCLUDING short routes (less than 15 meters)";
title3;
proc freq data=routes_only;
    where stationary = 0;
    tables callid; /*0=not called*/
run;


/* Call-record frequency, leaving out travel to an appointment and apartments. */
title1 "Routes Going by Active Sampled IDs with no Call record (0=no record)";
title2 "EXCLUDING When the interviewer was on the way to an appt";
title3 "EXCLUDING Apartments";
proc freq data=routes_only;
    tables callid; /*0=not called*/
    where trav_to_appt in (0,.) and apt=0;
run;

/* Call-record frequency, leaving out travel to an appointment and short routes. */
title1 "Routes Going by Active Sampled IDs with no Call record (0=no record)";
title2 "EXCLUDING When the interviewer was on the way to an appt";
title3 "ALSO EXCLUDING short routes (less than 15 meters)";
proc freq data=routes_only;
    tables callid; /*0=not called*/
    where trav_to_appt=0 and stationary=0;
run;

/* Call-record frequency for routes that are not short. The subtitle used to
   be issued as TITLE3 and was cleared by the TITLE3; statement right after
   it, so it never printed; it is now TITLE2. */
title1 "Routes Going by Active Sampled IDs with no Call record (0=no record)";
title2 "EXCLUDING short routes (less than 15 meters)";
title3;
proc freq data=routes_only;
    tables callid; /*0=not called*/
    where stationary=0;
run;

/* Speed (CALCSPD) summaries by whether the row has a call record (CALLED). */
title3;
title1 "Mean Speed by call record / no call record";
title2 "INCLUDING all cases";
proc means data=routes_only mean median stddev;
    var calcspd;
    class called;
run;

title1 "Mean Speed by call record / no call record";
title2 "EXCLUDING When the interviewer was on the way to an appt";
proc means data=routes_only mean median stddev;
    where trav_to_appt = 0;
    var calcspd;
    class called;
run;

title1 "Mean Speed by call record / no call record";
title2 "EXCLUDING When the interviewer was on the way to an appt or when calcspd ge 33.33 (120 Km/hr)";
proc means data=routes_only mean median stddev;
    where trav_to_appt = 0 and calcspd < 33.3333;
    var calcspd;
    class called;
run;

title1 "Mean Speed by call record / no call record";
title2 "EXCLUDING When the interviewer was on the way to an appt or when calcspd ge 33.33 (120 Km/hr)";
title3 "EXCLUDING routes < 15 meters";
proc means data=routes_only mean median stddev;
    where trav_to_appt = 0 and calcspd < 33.3333 and stationary = 0;
    var calcspd;
    class called;
run;

/* Full distribution of speed, with a histogram, by CALLED. ROUTES_ONLY is
   re-ordered by CALLED first. */
title1 "Distribution of speeds by call record / no call record";
title2;
title3;
proc sort data=routes_only;
    by called;
run;
proc univariate data=routes_only;
    class called;
    var calcspd;
    histogram;
run;

title1;
title2;

/* Macro GETSPDS: frequency of CALLID among routes whose speed is at most
   &SPD, written (without printing) to the dataset M&FNAME.
     spd   = upper speed limit, compared against CALCSPD
     fname = suffix of the output dataset name */
%macro getspds(spd=1.5,fname=1_5);
    proc freq data=routes_only;
        where calcspd le &spd;
        tables callid / noprint out=m&fname; /*0=not called*/
    run;
%mend getspds;

%getspds(spd=1.5,fname=1_5);
%getspds(spd=2.0,fname=2_0);
%getspds(spd=2.5,fname=2_5);
%getspds(spd=3.0,fname=3_0);
%getspds(spd=3.5,fname=3_5);
%getspds(spd=4.0,fname=4_0);

/* Stack the CALLID=0 rows of the six outputs and label each row with the
   speed limit of the dataset it came from. */
data spds;
    set m1_5(in=from15) m2_0(in=from20) m2_5(in=from25)
        m3_0(in=from30) m3_5(in=from35) m4_0(in=from40);
    if callid = 0;
    select;
        when (from15) speed = 1.5;
        when (from20) speed = 2.0;
        when (from25) speed = 2.5;
        when (from30) speed = 3.0;
        when (from35) speed = 3.5;
        when (from40) speed = 4.0;
        otherwise;
    end;
run;

/* Print the count and percent of no-call-record routes at each speed limit.
   The title read "Routing Going by"; it now matches the "Routes Going by"
   wording of the other report titles. */
title1 "Percentage of Routes Going by Active Sampled IDs with No Call Record by Speed";
proc print data=spds noobs;
    var speed count percent;
    format speed 5.1;
run;


/* Call-record frequency among slow routes (three speed limits), leaving out
   rows flagged as travel to an appointment. */
title1 "Routes Going by Active Sampled IDs with no Call record (0=no record) Speed=1.5m/sec or less";
title2 "EXCLUDING routes on the way to an appointment";
proc freq data=routes_only;
    where calcspd le 1.5 and trav_to_appt = 0;
    tables callid; /*0=not called*/
run;

title1 "Routes Going by Active Sampled IDs with no Call record (0=no record) Speed=2.0m/sec or less";
title2 "EXCLUDING routes on the way to an appointment";
proc freq data=routes_only;
    where calcspd le 2.0 and trav_to_appt = 0;
    tables callid; /*0=not called*/
run;

title1 "Routes Going by Active Sampled IDs with no Call record (0=no record) Speed=2.5m/sec or less";
title2 "EXCLUDING routes on the way to an appointment";
proc freq data=routes_only;
    where calcspd le 2.5 and trav_to_appt = 0;
    tables callid; /*0=not called*/
run;


/* Call-record frequency among routes at 2.0 m/sec or less, leaving out
   travel to an appointment and short routes. The third title was misspelled
   "EXCLUSING". */
title1 "Routes Going by Active Sampled IDs with no Call record (0=no record) Speed=2.0m/sec or less";
title2 "EXCLUDING routes on the way to an appointment";
title3 "EXCLUDING routes < 15 meters long";
proc freq data=routes_only;
    tables callid; /*0=not called*/
    where calcspd le 2.0 and trav_to_appt=0 and stationary=0;
run;


/* Distribution of the number of routes per housing unit. COUNT is 0 for the
   never-passed units added from NOTPASSED. */
title1 "Number of routes associated with each housing unit.";
title2 "Including routes < 15 meters long";
proc freq data=points;
    tables count;
run;

/* The same distribution, built from routes that are not short. */
title1 "Number of routes associated with each housing unit.";
title2 "EXCLUDING routes < 15 meters long";
proc freq data=points_stat;
    tables count;
run;



/*Page 19. The following text: "Of course, as shown in Table 3, interviewers can pass housing units several times. 
An alternative explanation for the lack of call records is that interviewers only generate one call record for 
each housing unit that is passed several times. While this may be true in some cases, among active sampled housing 
units that are ever passed (i.e. passed 1+ times), 36% do not have a call record recorded at all for the day.  
Identifying the location of an apartment may be more difficult than a single family home. When we examine only 
non-apartments (e.g., single family homes), the percent of sampled housing units that are passed without a call 
record recorded for the day increases to 42%." 
These PROC FREQs get those numbers*/

/* Keep one row per housing unit, choosing the row with the highest CALLID:
   the first sort puts that row first within each COMBINEID, and the NODUPKEY
   sort then keeps the first row of each COMBINEID. A unit therefore has
   CALLID=0 only when none of its rows has a call record.
   NOTE(review): the de-duplication is by COMBINEID alone, not by day --
   confirm this matches "for the day" in the quoted text. */
proc sort data=routes_only;
    by combineid descending callid;
run;
proc sort data=routes_only out=oneperhh nodupkey;
    by combineid;
run;

/* "Household" was misspelled "Houshold" in both title1 statements below. */
title1 "Call Number for Each Household Ever Passed,";
title2 "Therefore, CALLID=0 is the percent of sampled HUs not called at all that day";
title3;
proc freq data=oneperhh;
    tables callid;
run;

title1 "Call Number for Each Household Ever Passed,";
title2 "Therefore, CALLID=0 is the percent of sampled HUs not called at all that day";
title3 "Excluding Apartments";
proc freq data=oneperhh;
    tables callid;
    where apt=0;
run;

/* End of the first report file. */
ods rtf close;

/* ROUTES_ONLY2: ROUTES_ONLY plus CALLREC (1 = the row has a call record,
   0 = it does not) and TRIPID, the cluster ID used by the survey procedures
   below. */
data routes_only2;
    set routes_only;
    if callid = 0 then callrec = 0;
    if callid > 0 then callrec = 1;
    /* This line used to assign CALLID to itself (a no-op); it now states
       explicitly that CALLREC is missing when CALLID is missing. */
    if callid = . then callrec = .;
    /* Cluster ID: SEGMENT and DATE concatenated, with blanks removed. */
    tripid = compress(segment||put(date,mmddyy10.));
run;

/* One row per housing unit, keeping the row with the highest CALLID. */
proc sort data=routes_only2;
    by combineid descending callid;
run;
proc sort data=routes_only2 out=oneperhh2 nodupkey;
    by combineid;
run;

/* List of distinct TRIPID values, saved to TRIPS. */
proc freq data=routes_only2;
    tables tripid / noprint out=trips;
run;

/* Check the CALLREC coding against CALLID, including missing values. */
proc freq data=routes_only2;
    tables callrec*callid / list missing;
run;

/* Table 3, rows 1-6: percent with a call record (CALLREC), with TRIPID as
   the cluster and a sampling rate of 0.025. Rows 1-4 use one row per route
   (ROUTES_ONLY2); rows 5-6 use one row per housing unit (ONEPERHH2). */
ods rtf file="<Path>\evidence of driving by v17 table 3.rtf";

title1 "Percent of Routes that pass a sampled housing unit";
title2 "TABLE: Comparison of GPS data and call record data";
title3 "ROW1: All routes";
proc surveyfreq data=routes_only2 rate=0.025;
    cluster tripid;
    tables callrec;
run;

title1 "Percent of Routes that pass a sampled housing unit";
title2 "TABLE: Comparison of GPS data and call record data";
title3 "ROW2: Excluding Appointments";
proc surveyfreq data=routes_only2 rate=0.025;
    where trav_to_appt = 0;
    cluster tripid;
    tables callrec;
run;

title1 "Percent of Routes that pass a sampled housing unit";
title2 "TABLE: Comparison of GPS data and call record data";
title3 "ROW3: Excluding Apartments";
proc surveyfreq data=routes_only2 rate=0.025;
    where apt = 0;
    cluster tripid;
    tables callrec;
run;

title1 "Percent of Routes that pass a sampled housing unit";
title2 "TABLE: Comparison of GPS data and call record data";
title3 "ROW4: Excluding routes less than 15m";
proc surveyfreq data=routes_only2 rate=0.025;
    where stationary = 0;
    cluster tripid;
    tables callrec;
run;

title1 "Percent of active sample housing passed at least once by an inferred route";
title2 "TABLE: Comparison of GPS data and call record data";
title3 "ROW5: All active sampled housing units";
proc surveyfreq data=oneperhh2 rate=0.025;
    cluster tripid;
    tables callrec;
run;

title1 "Percent of active sample housing passed at least once by an inferred route";
title2 "TABLE: Comparison of GPS data and call record data";
title3 "ROW6: Excluding apartments";
proc surveyfreq data=oneperhh2 rate=0.025;
    where apt = 0;
    cluster tripid;
    tables callrec;
run;

/* Table 3, rows 7-8: CALCSPD modeled on CALLREC with no intercept, so the
   solution shows one coefficient per CALLREC level. TRIPID is the cluster. */
title1 "Speed";
title2 "TABLE: Comparison of GPS data and call record data";
title3 "ROW7: All routes";
proc surveyreg data=routes_only2 rate=0.025;
    cluster tripid;
    class callrec;
    model calcspd=callrec / noint solution vadjust=none;
run;

title1 "Speed";
title2 "TABLE: Comparison of GPS data and call record data";
title3 "ROW8: Excluding appointments";
proc surveyreg data=routes_only2 rate=0.025;
    where trav_to_appt = 0;
    cluster tripid;
    class callrec;
    model calcspd=callrec / noint solution vadjust=none;
run;

/* End of the Table 3 report file. */
ods rtf close;
